
import pandas as pd
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk

## Root folder of the PatentsView data. main() builds every input and output path by
## plain string concatenation onto this value, so the trailing slash is required.
## NOTE(review): this name shadows the builtin `dir()`; renaming it means updating main() too.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

## ===========================================================================================================
## Robustness check: validation using a random sample of 5 candidate patents per focal patent (excluding pairs in the top-5 list)
## Zihao Li. 06/2024

## Inputs:  rawdata/g_us_patent_citation.tsv (from PatentsView)
##          cleandata/sim_score_1981_2015_top5.csv
##          cleandata/sim_score_1981_2015_sample.csv

## Outputs: temp/omission_panel5_randsample.csv
## ===========================================================================================================

def _exclude_top5_pairs(df_sample, df_top5):
    """Return the rows of ``df_sample`` whose pair is not listed in ``df_top5``.

    A pair is identified by the ``(patent_idx, cited_patent_idx)`` columns.
    The anti-join is done with a left merge plus the ``_merge`` indicator,
    keeping only the ``left_only`` rows; the helper column is dropped again.
    """
    flagged = df_sample.merge(
        df_top5[['patent_idx', 'cited_patent_idx']],
        on=['patent_idx', 'cited_patent_idx'],
        how='left',
        indicator=True,
    )
    return flagged[flagged['_merge'] == 'left_only'].drop(columns='_merge')


def _build_citation_lists(df_actual):
    """Collapse citation rows into one row per ``patent_id``.

    Returns a DataFrame with the columns ``patent_id``,
    ``actual_citation_list`` (list of the patent's ``citation_patent_id``
    values) and ``num_citations`` (length of that list).
    """
    # groupby sorts the group keys by default, so the result is already
    # ordered by patent_id and needs no extra sort.
    citation_lists = (
        df_actual.groupby('patent_id')['citation_patent_id']
        .agg(list)
        .rename('actual_citation_list')
        .reset_index()
    )
    citation_lists['num_citations'] = citation_lists['actual_citation_list'].map(len)
    return citation_lists


def _flag_omissions(cited_ids, citation_lists):
    """Return 0 where the cited id is in the paired citation list, else 1.

    Both arguments are iterated in lockstep, so they must have equal length.
    """
    return [
        0 if cited_id in citations else 1
        for cited_id, citations in zip(cited_ids, citation_lists)
    ]


def main():
    """Build the omission panel for five randomly sampled non-top-5 candidates.

    Steps:
      1. Load the sampled similarity pairs and remove every pair that also
         appears in the top-5 file.
      2. Draw five of the remaining pairs per ``patent_idx`` (seed 42).
      3. Load the actual citations, keep only rows whose ids are numeric, and
         collect the cited ids of each patent into a list.
      4. Left-merge the lists onto the sampled pairs, drop pairs whose patent
         has no citation list, and set ``omission`` to 0 when
         ``cited_patent_id`` is in the list and to 1 otherwise.
      5. Write the result to ``temp/omission_panel5_randsample.csv``.

    All paths are relative to the module-level ``dir``. The trailing shape
    comments are the values recorded by the original author.

    Raises:
        ValueError: from ``DataFrame.sample`` if some ``patent_idx`` has fewer
            than five remaining pairs (sampling is without replacement), or
            from ``pd.to_numeric`` if an id that passed the filters still
            cannot be parsed.
    """
    print('Loading artificial citation data...')
    df_artificial = pd.read_csv(dir + 'cleandata/sim_score_1981_2015_sample.csv')
    df_artificial_top5 = pd.read_csv(dir + 'cleandata/sim_score_1981_2015_top5.csv')

    # Exclude patents that are among the top five
    df_artificial = _exclude_top5_pairs(df_artificial, df_artificial_top5)
    print(df_artificial.shape) # (183123722, 7)
    del df_artificial_top5

    # Randomly sample five patents for each focal patent.
    # The call is kept exactly as it was (per-group DataFrame.sample with a
    # fixed seed) so that the drawn sample stays reproducible across runs.
    df_artificial = df_artificial.groupby('patent_idx', group_keys=False).apply(pd.DataFrame.sample, n=5, random_state=42)
    print(df_artificial.shape) # (9065575, 7)
    df_artificial = df_artificial.drop(columns=['patent_idx', 'cited_patent_idx', 'patent_year', 'cited_patent_year'])

    ## Load actual citation data
    print('Loading actual citation data...')
    df_actual = parallel_read_csv(dir + 'rawdata/g_us_patent_citation.tsv', file_type='tsv')
    print('Shape of actual_citation dataset is {}.'.format(df_actual.shape)) # (128401915, 7)

    ## Special characters and non-numerics
    print('Removing non-numeric patents and converting patent_id to numeric...')
    # na=True treats a missing id as non-numeric, so such rows are dropped by
    # both filters instead of breaking the boolean negation.
    has_letters = df_actual["patent_id"].str.contains("[a-zA-Z]", na=True)
    df_actual = df_actual[~has_letters]
    has_non_numeric = df_actual["citation_patent_id"].str.contains("[^0-9.]", na=True)
    df_actual = df_actual[~has_non_numeric]
    del has_letters, has_non_numeric
    df_actual["patent_id"] = pd.to_numeric(df_actual["patent_id"], errors="raise")
    df_actual["citation_patent_id"] = pd.to_numeric(df_actual["citation_patent_id"], errors="raise")
    print('Shape of actual_citation dataset (after removing non-numeric patents) is {}.'.format(df_actual.shape)) # (113603267, 7)

    ## Generate actual citation list
    print('Generating actual citation list...')
    df_actual_lst = _build_citation_lists(df_actual)
    del df_actual  # the row-level citations are not needed past this point
    print('Shape of actual_citation dataset (after grouping by patent_id) is {}.'.format(df_actual_lst.shape)) # (6719417, 3)

    # Merge
    print('Merging actual_citation_list with artificial citation data...')
    df = df_artificial.merge(df_actual_lst, how='left', on='patent_id')
    del df_actual_lst, df_artificial
    print('Shape of merged dataset is {}.'.format(df.shape)) # (24766255, 5)

    ## Generate omission index
    print('Generating omission index...')
    # Pairs whose focal patent has no actual citation list cannot be classified.
    df = df.dropna(subset=['actual_citation_list'])
    # Iterating the two columns directly avoids building a Series per row,
    # and also works when no rows are left.
    df['omission'] = _flag_omissions(df['cited_patent_id'], df['actual_citation_list'])
    print('Shape of omission dataset is {}.'.format(df.shape)) # (23754000, 6)

    ## Export omission panel
    print('Exporting omission panel...')
    df.to_csv(dir + 'temp/omission_panel5_randsample.csv', index=False)

## Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()